### Load packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 etc.
library(magrittr) # For extra-piping operators (e.g. %<>%)
# Read trips.csv from the SDS-master data URL and print a column overview.
trips <- read_csv(
  "https://sds-aau.github.io/SDS-master/M1/data/trips.csv"
)
glimpse(trips)
Rows: 46,510
Columns: 11
$ X1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,…
$ username <chr> "@lewellenmichael", "@lewellenmichael", "@lewellenmichael", "@lewellenmichael", "@waylandchin…
$ country <chr> "Mexico", "Mexico", "Mexico", "Jordan", "China", "Vietnam", "Hong Kong", "China", "China", "C…
$ country_code <chr> "MX", "MX", "MX", "JO", "CN", "VN", "HK", "CN", "CN", "CN", "TH", "MY", "KH", "VN", "IN", "IN…
$ country_slug <chr> "mexico", "mexico", "mexico", "jordan", "china", "vietnam", "hong-kong", "china", "china", "c…
$ date_end <date> 2018-06-15, 2018-06-03, 2017-11-05, 2017-08-07, 2017-03-18, 2017-02-16, 2016-09-01, 2016-08-…
$ date_start <date> 2018-06-04, 2018-05-31, 2017-11-01, 2017-07-24, 2017-02-17, 2016-09-02, 2016-08-02, 2016-07-…
$ latitude <dbl> 21, 19, 21, 31, 40, 10, 22, 22, 22, 18, 7, 3, 11, 10, 13, 26, 27, 27, 28, 28, 19, 11, 22, 22,…
$ longitude <dbl> -101, -99, -86, 35, 122, 106, 114, 114, 113, 109, 98, 101, 104, 106, 80, 75, 78, 78, 77, 77, …
$ place <chr> "Guanajuato", "Mexico City", "Cancun", "Amman", "Yingkou", "Ho Chi Minh City", "Shenzhen", "H…
$ place_slug <chr> "mexico", "mexico-city-mexico", "cancun-mexico", "amman-jordan", "china", "ho-chi-minh-city-v…
# Read people.csv from the SDS-master data URL and print a column overview.
people <- read_csv(
  "https://sds-aau.github.io/SDS-master/M1/data/people.csv"
)
glimpse(people)
Rows: 4,016
Columns: 6
$ X1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25…
$ username <chr> "@lewellenmichael", "@waylandchin", "@karan", "@skaboss217", "@apwn", "@samcalma", "@paulbre…
$ followers <dbl> 1, 0, 2, 0, 17, 3, 4, 2, 17, 2, 11, 11, 5, 8, 0, 9, 3, 5, 25, 1, 1, 1, 61, 2, 11, 0, 1, 2, 9…
$ following <dbl> 2, 2, 1, 1, 426, 3, 9, 3, 23, 2, 17, 6, 9, 7, 1, 6, 3, 34, 23, 4, 4, 4, 120, 2, 10, 2, 2, 5,…
$ work_raw <chr> "Software Dev, Startup Founder, Finance, Crypto, Product Manager, Education, Data, Ecommerce…
$ education_raw <chr> "High School, Bachelor's Degree", NA, NA, NA, NA, NA, NA, "High School, Bachelor's Degree, M…
# Read countrylist.csv from the SDS-master data URL and print a column overview.
countries <- read_csv(
  "https://sds-aau.github.io/SDS-master/M1/data/countrylist.csv"
)
glimpse(countries)
Rows: 249
Columns: 3
$ alpha_2 <chr> "AF", "AX", "AL", "DZ", "AS", "AD", "AO", "AI", "AQ", "AG", "AR", "AM", "AW", "AU", "AT", "AZ",…
$ region <chr> "Asia", "Europe", "Europe", "Africa", "Oceania", "Europe", "Africa", "Americas", NA, "Americas"…
$ sub_region <chr> "Southern Asia", "Northern Europe", "Southern Europe", "Northern Africa", "Polynesia", "Souther…
# NOTE(review): this repeats the countries load and glimpse from just above,
# so countrylist.csv is downloaded a second time and `countries` is simply
# overwritten with the same data; the duplicate looks unintentional.
countries <- read_csv( 'https://sds-aau.github.io/SDS-master/M1/data/countrylist.csv')
countries %>% glimpse()
Rows: 249
Columns: 3
$ alpha_2 <chr> "AF", "AX", "AL", "DZ", "AS", "AD", "AO", "AI", "AQ", "AG", "AR", "AM", "AW", "AU", "AT", "AZ",…
$ region <chr> "Asia", "Europe", "Europe", "Africa", "Oceania", "Europe", "Africa", "Americas", NA, "Americas"…
$ sub_region <chr> "Southern Asia", "Northern Europe", "Southern Europe", "Northern Africa", "Polynesia", "Souther…
# Read the tab-separated nomad_cities.csv from the SDS-master data URL and
# print a column overview.
cities <- read_delim(
  "https://sds-aau.github.io/SDS-master/M1/data/nomad_cities.csv",
  delim = "\t"
)
glimpse(cities)
Rows: 781
Columns: 27
$ X1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 2…
$ coffee_in_cafe <dbl> 1.73, 0.85, 1.99, 1.88, 5.00, 4.00, 5.38, 5.00, 5.00, 4.03, 3.50, 2.69, 4.30, 4.97,…
$ cost_beer <dbl> 1.73, 0.85, 1.99, 1.88, 5.00, 4.00, 5.38, 5.00, 5.00, 4.03, 3.50, 2.69, 4.30, 4.97,…
$ cost_coworking <dbl> 152.41, 98.88, 159.13, 47.01, 200.00, 250.00, 161.30, 300.00, 490.00, 96.78, 300.00…
$ cost_expat <dbl> 1273, 780, 1653, 1640, 3309, 4325, 2197, 2691, 3764, 1859, 2760, 1357, 2075, 2167, …
$ cost_nomad <dbl> 1364, 777, 1639, 1545, 3028, 3238, 2554, 3503, 3427, 2245, 2956, 1681, 2528, 2408, …
$ female_friendly <dbl> 1.00, 0.80, 1.00, 1.00, 0.80, 0.80, 0.80, 0.80, 0.80, 0.80, 0.80, 0.76, 1.00, 1.00,…
$ fragile_states_index <chr> "52.7", "78.8", "40.8", "DotMap(__next__=DotMap())", "34", "34", "39.8", "34", "34"…
$ free_wifi_available <dbl> 0.40, 0.60, 0.60, 1.00, 0.60, 1.00, 0.60, 0.40, 1.00, 0.24, 0.60, 0.62, 0.76, 1.00,…
$ freedom_score <chr> "0.6", "0.2", "0.8", "0.6", "0.6", "0.6", "0.8", "0.6", "0.6", "0.8", "0.6", "0.8",…
$ friendly_to_foreigners <dbl> 0.60, 0.60, 0.80, 0.80, 0.80, 0.80, 0.80, 1.00, 1.00, 0.80, 0.80, 0.83, 0.40, 1.00,…
$ internet_speed <dbl> 31, 14, 15, 16, 118, 81, 18, 23, 55, 24, 99, 21, 38, 11, 19, 17, 5, 20, 15, 55, 101…
$ latitude <dbl> 47.497912, 18.787747, 50.075538, 25.091075, 30.267153, 25.761680, 40.416775, 45.523…
$ leisure <dbl> 0.80, 0.62, 1.00, 1.00, 1.00, 1.00, 0.60, 1.00, 0.60, 0.78, 0.80, 0.63, 0.60, 0.60,…
$ lgbt_friendly <dbl> 0.27, 0.60, 0.60, 0.80, 0.60, 1.00, 1.00, 0.80, 0.80, 1.00, 1.00, 0.64, 0.60, 1.00,…
$ life_score <dbl> 0.86, 0.75, 0.83, 0.93, 0.95, 1.00, 0.88, 0.95, 0.92, 0.85, 0.87, 0.84, 0.87, 0.89,…
$ longitude <dbl> 19.040235, 98.993128, 14.437800, 121.559834, -97.743061, -80.191790, -3.703790, -12…
$ nightlife <dbl> 1.00, 0.40, 1.00, 0.60, 1.00, 1.00, 0.80, 1.00, 1.00, 0.80, 0.60, 0.80, 0.60, 0.60,…
$ nomadScore <dbl> 1.00, 0.95, 0.94, 0.94, 0.94, 0.92, 0.90, 0.90, 0.89, 0.88, 0.88, 0.88, 0.87, 0.87,…
$ nomad_score <dbl> 1.00, 0.95, 0.94, 0.94, 0.94, 0.92, 0.90, 0.90, 0.89, 0.88, 0.88, 0.88, 0.87, 0.87,…
$ peace_score <chr> "0.8", "0.4", "0.8", "DotMap(__next__=DotMap())", "0.8", "0.8", "0.8", "0.8", "0.8"…
$ place <chr> "Budapest", "Chiang Mai", "Prague", "Taipei", "Austin", "Miami", "Madrid", "Portlan…
$ places_to_work <dbl> 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8…
$ press_freedom_index <chr> "28.17", "44.53", "16.66", "24.37", "22.49", "22.49", "19.92", "22.49", "22.49", "1…
$ racism <dbl> 0.40, 0.40, 0.42, 0.00, 0.80, 0.80, 0.60, 0.80, 0.80, 1.00, 0.80, 1.00, 0.40, 1.00,…
$ safety <dbl> 0.60, 0.80, 0.80, 1.00, 0.73, 0.73, 0.80, 0.80, 0.60, 0.80, 0.40, 0.80, 0.60, 0.80,…
$ weed <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
# Four score columns were read as character. At least fragile_states_index and
# peace_score contain the placeholder text "DotMap(__next__=DotMap())";
# as.numeric() turns such cells into NA (with a coercion warning) and parses
# the remaining values as numbers.
cities <- cities %>%
  mutate(
    fragile_states_index = as.numeric(fragile_states_index),
    peace_score = as.numeric(peace_score),
    # Overwrite freedom_score in place. The earlier spelling "fredom_score"
    # created an extra column and left freedom_score as character.
    freedom_score = as.numeric(freedom_score),
    press_freedom_index = as.numeric(press_freedom_index)
  )
cities %>% glimpse()
Rows: 781
Columns: 28
$ X1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 2…
$ coffee_in_cafe <dbl> 1.73, 0.85, 1.99, 1.88, 5.00, 4.00, 5.38, 5.00, 5.00, 4.03, 3.50, 2.69, 4.30, 4.97,…
$ cost_beer <dbl> 1.73, 0.85, 1.99, 1.88, 5.00, 4.00, 5.38, 5.00, 5.00, 4.03, 3.50, 2.69, 4.30, 4.97,…
$ cost_coworking <dbl> 152.41, 98.88, 159.13, 47.01, 200.00, 250.00, 161.30, 300.00, 490.00, 96.78, 300.00…
$ cost_expat <dbl> 1273, 780, 1653, 1640, 3309, 4325, 2197, 2691, 3764, 1859, 2760, 1357, 2075, 2167, …
$ cost_nomad <dbl> 1364, 777, 1639, 1545, 3028, 3238, 2554, 3503, 3427, 2245, 2956, 1681, 2528, 2408, …
$ female_friendly <dbl> 1.00, 0.80, 1.00, 1.00, 0.80, 0.80, 0.80, 0.80, 0.80, 0.80, 0.80, 0.76, 1.00, 1.00,…
$ fragile_states_index <dbl> 52.7, 78.8, 40.8, NA, 34.0, 34.0, 39.8, 34.0, 34.0, 39.8, 34.0, 29.2, 29.0, 21.3, 2…
$ free_wifi_available <dbl> 0.40, 0.60, 0.60, 1.00, 0.60, 1.00, 0.60, 0.40, 1.00, 0.24, 0.60, 0.62, 0.76, 1.00,…
$ freedom_score <chr> "0.6", "0.2", "0.8", "0.6", "0.6", "0.6", "0.8", "0.6", "0.6", "0.8", "0.6", "0.8",…
$ friendly_to_foreigners <dbl> 0.60, 0.60, 0.80, 0.80, 0.80, 0.80, 0.80, 1.00, 1.00, 0.80, 0.80, 0.83, 0.40, 1.00,…
$ internet_speed <dbl> 31, 14, 15, 16, 118, 81, 18, 23, 55, 24, 99, 21, 38, 11, 19, 17, 5, 20, 15, 55, 101…
$ latitude <dbl> 47.497912, 18.787747, 50.075538, 25.091075, 30.267153, 25.761680, 40.416775, 45.523…
$ leisure <dbl> 0.80, 0.62, 1.00, 1.00, 1.00, 1.00, 0.60, 1.00, 0.60, 0.78, 0.80, 0.63, 0.60, 0.60,…
$ lgbt_friendly <dbl> 0.27, 0.60, 0.60, 0.80, 0.60, 1.00, 1.00, 0.80, 0.80, 1.00, 1.00, 0.64, 0.60, 1.00,…
$ life_score <dbl> 0.86, 0.75, 0.83, 0.93, 0.95, 1.00, 0.88, 0.95, 0.92, 0.85, 0.87, 0.84, 0.87, 0.89,…
$ longitude <dbl> 19.040235, 98.993128, 14.437800, 121.559834, -97.743061, -80.191790, -3.703790, -12…
$ nightlife <dbl> 1.00, 0.40, 1.00, 0.60, 1.00, 1.00, 0.80, 1.00, 1.00, 0.80, 0.60, 0.80, 0.60, 0.60,…
$ nomadScore <dbl> 1.00, 0.95, 0.94, 0.94, 0.94, 0.92, 0.90, 0.90, 0.89, 0.88, 0.88, 0.88, 0.87, 0.87,…
$ nomad_score <dbl> 1.00, 0.95, 0.94, 0.94, 0.94, 0.92, 0.90, 0.90, 0.89, 0.88, 0.88, 0.88, 0.87, 0.87,…
$ peace_score <dbl> 0.8, 0.4, 0.8, NA, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0, 0.8, 0.6,…
$ place <chr> "Budapest", "Chiang Mai", "Prague", "Taipei", "Austin", "Miami", "Madrid", "Portlan…
$ places_to_work <dbl> 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8…
$ press_freedom_index <dbl> 28.17, 44.53, 16.66, 24.37, 22.49, 22.49, 19.92, 22.49, 22.49, 19.92, 22.49, 17.27,…
$ racism <dbl> 0.40, 0.40, 0.42, 0.00, 0.80, 0.80, 0.60, 0.80, 0.80, 1.00, 0.80, 1.00, 0.40, 1.00,…
$ safety <dbl> 0.60, 0.80, 0.80, 1.00, 0.73, 0.73, 0.80, 0.80, 0.60, 0.80, 0.40, 0.80, 0.60, 0.80,…
$ weed <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,…
$ fredom_score <dbl> 0.6, 0.2, 0.8, 0.6, 0.6, 0.6, 0.8, 0.6, 0.6, 0.8, 0.6, 0.8, 0.8, 0.8, 0.8, 0.6, 0.2…
# Column names used for the descriptive correlation matrix
vars.desc <- c(
  "nomad_score",
  "cost_nomad",
  "places_to_work",
  "freedom_score",
  "friendly_to_foreigners",
  "life_score"
)
First, let's look at a classical correlation matrix.
# ggcorr() is provided by the GGally package, which is not attached by the
# library() calls at the top of this script, so attach it here first.
library(GGally)

# Correlation matrix of the descriptive variables. Each cell is labelled with
# its coefficient rounded to 2 decimals; label_alpha = TRUE ties the label
# transparency to the size of the coefficient.
ggcorr(
  cities[, vars.desc],
  label = TRUE,
  label_size = 3,
  label_round = 2,
  label_alpha = TRUE
)
library(FactoMineR)
library(factoextra)
# Remove the X1 index column, then keep only rows without any missing value.
cities <- drop_na(select(cities, -X1))
# Principal component analysis on the numeric columns only, with variables
# scaled to unit variance; graph = TRUE also draws FactoMineR's default plots.
# Base is.numeric is used because purrr's is_numeric() was deprecated and has
# since been removed from purrr.
res_pca <- cities %>%
  select(where(is.numeric)) %>%
  PCA(scale.unit = TRUE, graph = TRUE)
# Shared colour gradient for the contribution scale in the PCA plots below.
# Scree plot of the first 10 components, with value labels on the bars.
fviz_screeplot(
  res_pca,
  addlabels = TRUE,
  ncp = 10,
  ggtheme = theme_gray()
)

# Variable plot: transparency follows cos2, colour follows contribution.
fviz_pca_var(
  res_pca,
  alpha.var = "cos2",
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE,
  ggtheme = theme_gray()
)

# Biplot of individuals (drawn as points) and variables; individual
# transparency follows cos2 and colour follows contribution.
fviz_pca_biplot(
  res_pca,
  alpha.ind = "cos2",
  col.ind = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  geom = "point",
  ggtheme = theme_gray()
)
# Plot of the within-cluster sum of squares ("wss") against the number of
# k-means clusters, computed on the standardised numeric columns of complete
# rows. Base is.numeric is used because purrr's is_numeric() was deprecated
# and has since been removed from purrr.
cities %>%
  drop_na() %>%
  select(where(is.numeric)) %>%
  scale() %>%
  fviz_nbclust(kmeans, method = "wss")
# Hierarchical clustering (hclust) of the numeric columns, standardised via
# stand = TRUE and cut into k = 3 clusters.
# Base is.numeric is used because purrr's is_numeric() was deprecated and has
# since been removed from purrr.
hc <- cities %>%
  select(where(is.numeric)) %>%
  hcut(
    hc_func = "hclust",
    k = 3,
    stand = TRUE
  )
# Print the structure of the clustering result object.
glimpse(hc)
List of 12
$ merge : int [1:753, 1:2] -620 -673 -624 -560 -584 -619 -646 -72 -600 -688 ...
$ height : num [1:753] 0.208 0.282 0.322 0.376 0.44 ...
$ order : int [1:754] 49 63 109 320 198 139 178 10 4 5 ...
$ labels : NULL
$ method : chr "ward.D2"
$ call : language stats::hclust(d = x, method = hc_method)
$ dist.method: chr "euclidean"
$ cluster : int [1:754] 1 2 1 1 1 1 1 1 1 1 ...
$ nbclust : num 3
$ silinfo :List of 3
..$ widths :'data.frame': 754 obs. of 3 variables:
.. ..$ cluster : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
.. ..$ neighbor : num [1:754] 2 2 2 2 2 2 2 2 2 2 ...
.. ..$ sil_width: num [1:754] 0.378 0.372 0.365 0.364 0.362 ...
..$ clus.avg.widths: num [1:3] 0.2186 0.0856 0.1571
..$ avg.width : num 0.163
$ size : int [1:3] 350 239 165
$ data : num [1:754, 1:25] -0.814 -1.272 -0.679 0.889 0.368 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : NULL
.. ..$ : chr [1:25] "coffee_in_cafe" "cost_beer" "cost_coworking" "cost_expat" ...
..- attr(*, "scaled:center")= Named num [1:25] 3.29 3.29 209.08 1874.17 2304.65 ...
.. ..- attr(*, "names")= chr [1:25] "coffee_in_cafe" "cost_beer" "cost_coworking" "cost_expat" ...
..- attr(*, "scaled:scale")= Named num [1:25] 1.92 1.92 173.91 1256.66 1081.81 ...
.. ..- attr(*, "names")= chr [1:25] "coffee_in_cafe" "cost_beer" "cost_coworking" "cost_expat" ...
- attr(*, "class")= chr [1:2] "hclust" "hcut"
# Cluster plot of the hierarchical clustering, passing the same numeric
# columns that were clustered as the data argument.
# Base is.numeric is used because purrr's is_numeric() was deprecated and has
# since been removed from purrr.
hc %>%
  fviz_cluster(
    data = cities %>% select(where(is.numeric)),
    ggtheme = theme_gray()
  )

# Cluster label assigned to each row.
hc$cluster
[1] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 2 1 1 2 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[57] 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 2 2 1 1 1 1
[113] 1 2 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 2 1 1 1 1 1 1 1 2 1 1 2 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 2 1
[169] 2 2 1 1 2 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 2 2 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 2 2
[225] 2 1 1 1 2 1 1 2 1 1 2 2 1 1 1 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1
[281] 1 1 1 1 1 2 1 1 1 3 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 3 1 2 1 2 2 1 1 2 1 1 2 1 2 1 2 1 1 3 1 1 1 1
[337] 1 2 3 1 2 2 1 1 2 1 1 2 1 1 1 2 1 2 1 1 1 2 2 2 1 2 2 2 2 1 1 2 1 2 2 2 1 2 2 1 2 1 1 2 2 1 1 2 1 2 3 1 2 2 1 2
[393] 3 3 1 1 1 3 3 2 1 1 1 2 1 2 2 1 1 2 1 2 2 1 2 3 1 1 1 1 3 2 1 2 1 1 1 2 2 2 2 1 1 3 1 1 1 3 2 2 2 2 1 3 2 3 2 1
[449] 2 2 2 1 2 1 1 2 2 2 1 1 2 1 3 1 2 2 1 3 2 2 2 1 2 3 2 3 1 1 3 2 3 3 2 3 1 2 2 1 2 2 3 1 1 3 2 1 2 1 2 2 3 1 1 2
[505] 2 2 2 1 2 2 1 3 2 3 1 2 2 2 2 3 3 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 3 3 3 1 2 2 3 2 3 1 3 2 2 2 3 2 3
[561] 2 2 3 2 2 2 2 2 3 2 2 3 2 3 3 2 1 3 2 3 3 1 2 3 2 2 3 3 2 2 3 2 1 3 2 2 2 2 1 3 2 2 2 2 2 3 2 3 3 3 2 2 1 2 3 2
[617] 3 3 3 3 3 3 3 3 2 3 3 2 2 3 1 3 2 3 3 3 3 2 3 3 3 3 3 3 2 3 3 3 3 3 3 2 3 2 3 3 3 3 2 2 2 3 3 2 3 3 3 3 3 3 2 1
[673] 3 3 2 3 3 3 2 3 3 3 3 2 2 3 3 3 2 3 2 2 3 3 2 2 2 3 3 1 3 3 3 3 3 3 3 2 3 3 3 3 2 3 2 2 3 3 3 3 3 3 3 3 3 3 3 3
[729] 3 2 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
# Number of rows per cluster. The labels live in hc$cluster; `cities` has no
# column named `c` (tibbles do not partial-match `$`), so table(cities$c)
# tabulated NULL.
table(hc$cluster)